{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6c0d5938-3a83-42d9-bf90-c5cad8949bff",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 设置pip源\n",
    "!pip config set global.index-url https://mirrors.aliyun.com/pypi/simple\n",
    "# 设置apt源\n",
    "! echo \"deb http://mirrors.163.com/ubuntu/ focal main restricted universe multiverse\" >> /etc/apt/sources.list\n",
    "! apt update -y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5834674a-ac83-4c20-986f-602af81fcd65",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 安装python包\n",
    "!pip install pandas numpy matplotlib seaborn scikit-learn statsmodels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e43791c2-5d05-4e0c-926e-7baa95519ae0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "data = pd.read_csv('data-test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d3d92b9-ceb9-43c9-8881-2d54647793f6",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 查看数据集的前几行：\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80f448b3-ee2e-4a5c-af08-b7eed139587e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 查看数据集的基本信息：\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91910111-0db1-4d56-a510-8082daf97b2a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 查看数据集的统计描述：\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee4623fd-10e1-47d8-b5e0-883ccb06c6ce",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 检查缺失值：\n",
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48ccf0cd-eb3d-4dc5-870d-6f2b3c50debe",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 可视化缺失值\n",
    "sns.heatmap(data.isnull(), yticklabels=False, cbar=False, cmap='viridis')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55c2443c-b616-4a5c-91f7-797c8cbc8132",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 检查重复行\n",
    "duplicated_rows = data.duplicated().sum()\n",
    "print(\"重复行的数量:\", duplicated_rows)\n",
    "\n",
    "# 查找每个特征的唯一值数量\n",
    "unique_values = data.nunique()\n",
    "print(\"\\n\\n每个特征唯一值个数:\\n\", unique_values)\n",
    "\n",
    "# 查找每个特征的唯一值百分比\n",
    "unique_percentage = (unique_values / len(data)) * 100\n",
    "print(\"\\n\\n每个特征唯一值百分比:\\n\", unique_percentage)\n",
    "\n",
    "# 可视化唯一值数量\n",
    "plt.figure(figsize=(12, 6))\n",
    "unique_values.plot(kind='bar')\n",
    "plt.title('每个特征唯一值个数')\n",
    "plt.xlabel('特征')\n",
    "plt.ylabel('唯一值个数')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6bf0159a-32a9-4bff-97f2-f5bf91b8e9d4",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 可视化数据集中数值特征的分布：\n",
    "num_features = data.select_dtypes(include=[np.number])\n",
    "\n",
    "for feature in num_features.columns:\n",
    "    plt.figure(figsize=(10, 5))\n",
    "    sns.histplot(data[feature], kde=True)\n",
    "    plt.title(f'Distribution of {feature}')\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9199763b-3951-49e6-b728-b07b33293b2e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 可视化数据集中分类特征的分布：\n",
    "\n",
    "cat_features = data.select_dtypes(include=[np.object])\n",
    "\n",
    "for feature in cat_features.columns:\n",
    "    plt.figure(figsize=(10, 5))\n",
    "    sns.countplot(x=feature, data=data)\n",
    "    plt.title(f'Distribution of {feature}')\n",
    "    plt.xticks(rotation=45)\n",
    "    plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a53c28f8-08e8-4f6e-9b0c-2271403fc579",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 计算数值特征之间的相关性：\n",
    "numeric_data = data.select_dtypes(include=[np.int64,np.float64])\n",
    "corr_matrix = numeric_data.corr()\n",
    "\n",
    "# 可视化相关性矩阵：\n",
    "\n",
    "plt.figure(figsize=(12, 8))\n",
    "sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')\n",
    "plt.title('Correlation Matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5ffa843-ae71-4f00-92c5-e93e091ea6a1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 目标变量与特征之间的关系可视化\n",
    "target_variable = 'y'\n",
    "for column in data.columns:\n",
    "    print(column)\n",
    "    if data[column].dtype != 'object' and column != target_variable:\n",
    "        plt.figure(figsize=(10, 6))\n",
    "        sns.boxplot(data=data, x=target_variable, y=column)\n",
    "        plt.title(f\"{target_variable} vs {column}\")\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c591d56-4283-4c5a-90b2-5585b29f8070",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 箱型图（查看异常值）\n",
    "for feature in data.columns:\n",
    "    if data[feature].dtype != 'object':\n",
    "        plt.figure(figsize=(10,5))\n",
    "        sns.boxplot(data[feature])\n",
    "        plt.title(f'{feature} Boxplot')\n",
    "        plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44a6493c-9746-4e30-8310-db107cce317d",
   "metadata": {},
   "source": [
    "# 单变量分析\n",
    "单变量分析是对一个变量进行分析，以了解其分布、中心趋势和离散程度。我们可以使用直方图、箱形图和密度图等方法。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab77f3d9-0cd0-4e7e-a901-edc22c8b2482",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "column_name='age'\n",
    "# 直方图\n",
    "sns.histplot(data[column_name])\n",
    "plt.show()\n",
    "\n",
    "# 箱形图\n",
    "sns.boxplot(data[column_name])\n",
    "plt.show()\n",
    "\n",
    "# 密度图\n",
    "sns.kdeplot(data[column_name])\n",
    "plt.show()\n",
    "\n",
    "# 描述性统计\n",
    "data[column_name].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3038f027-2f3c-4b0f-858c-1a65b8e022d4",
   "metadata": {},
   "source": [
    "# 双变量分析\n",
    "双变量分析是对两个变量之间的关系进行分析。我们可以使用散点图、相关系数和回归图等方法。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ba0ba2d-7872-414e-b116-4120b6360485",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "column1 = 'age'\n",
    "column2 = 'duration'\n",
    "# 散点图\n",
    "sns.scatterplot(x=column1, y=column2, data=data)\n",
    "plt.show()\n",
    "\n",
    "# 相关系数\n",
    "data[[column1, column2]].corr()\n",
    "\n",
    "# 回归图\n",
    "sns.regplot(x=column1, y=column2, data=data)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1f80d83-186d-4423-aa71-ebb80945527c",
   "metadata": {},
   "source": [
    "# 多变量分析\n",
    "多变量分析涉及到三个或更多变量之间的关系。我们可以使用热力图、成对图和散点图矩阵等方法。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f842414-411e-4a95-af4a-1af724c18904",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 热力图\n",
    "numeric_data = data.select_dtypes(include=[np.int64,np.float64])\n",
    "corr_matrix = numeric_data.corr()\n",
    "sns.heatmap(corr_matrix, annot=True)\n",
    "plt.show()\n",
    "\n",
    "# 成对图\n",
    "sns.pairplot(data)\n",
    "plt.show()\n",
    "\n",
    "# 散点图矩阵\n",
    "sns.pairplot(data, hue='y')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "85998151-b7b1-4e40-bbbc-0a8b70d817d5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[NbConvertApp] Converting notebook data-explore.ipynb to html\n",
      "[NbConvertApp] Writing 4629660 bytes to data-explore.html\n"
     ]
    }
   ],
   "source": [
    "# jupyter 转 html\n",
    "\n",
    "!jupyter nbconvert --to html data-explore.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0c44473-348e-4bb4-b341-58102bf5d2e3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 安装html转markdown\n",
    "\n",
    "!pip install html2text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "925b46db-df22-44f0-af92-bc597069e98a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import html2text\n",
    "\n",
    "# 从文件读取 HTML 内容\n",
    "html_content = open(\"data-explore.html\", \"r\", encoding=\"utf-8\").read()\n",
    "\n",
    "# 初始化转换器\n",
    "converter = html2text.HTML2Text()\n",
    "\n",
    "# 转换 HTML 为 Markdown\n",
    "markdown_content = converter.handle(html_content)\n",
    "\n",
    "# 将转换后的 Markdown 保存到文件\n",
    "with open(\"data-explore.md\", \"w\", encoding=\"utf-8\") as file:\n",
    "    file.write(markdown_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1ec739c-6f59-419a-94d8-7c417269881e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# 将markdown文件转为docx\n",
    "\n",
    "! apt-get install -y pandoc\n",
    "! pip install pypandoc==1.12 pdfkit==1.0.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "4a32a476-a53a-4e18-8df4-ee4741af568b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "\n",
    "import pypandoc\n",
    "\n",
    "output = pypandoc.convert_file(\"data-explore.md\", 'docx', outputfile=\"data-explore.docx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95f6c858-57e5-47a9-97ac-5714019a55c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将html转为pdf\n",
    "\n",
    "# 直接安装\n",
    "! apt-get install -y wkhtmltopdf\n",
    "# 下载 离线包安装\n",
    "# ! apt-get install  xfonts-encodings xfonts-utils xfonts-base xfonts-75dpi\n",
    "# ! wget https://github.com/wkhtmltopdf/packaging/releases/download/0.12.6-1/wkhtmltox_0.12.6-1.focal_amd64.deb\n",
    "# ! dpkg -i wkhtmltox_0.12.6-1.focal_amd64.deb\n",
    "! wkhtmltopdf --version\n",
    "#! apt-get install -y pandoc texlive-xetex\n",
    "#!jupyter nbconvert --to pdf data-explore.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "21a0e72e-2bfc-4110-96b7-c1b34b0cadc5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将html转为pdf\n",
    "\n",
    "import pdfkit\n",
    "\n",
    "html = open(\"data-explore.html\").read()\n",
    "\n",
    "pdfkit.from_string(html, 'data-explore.pdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a05e5401-bb93-4eed-bd56-8abf92cf5a99",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
